rm(list=ls())
options(scipen=100,digits=10)

library(rgdal)
library(gdalUtils)
library(raster)
library(dplyr)
library(sf)
library(tidyverse)
library(lfe)
library(rgeos)
library(foreign)
library(lubridate)


## Merge with PM2.5
setwd("working directory")
load("Monitor_AOD.Rdata")
AOD_Monitor=all_data
names(AOD_Monitor)[3]="Date"

setwd("Monitor/PM2.5")
out.file=list.files ()
yearly_data=read.csv(out.file[1])
PM_data=data.frame(matrix(ncol = ncol(yearly_data), nrow = 0))
for(n in 1:length(out.file)){
  # 
  yearly_data=read.csv(out.file[n])
  PM_data=rbind(PM_data,yearly_data)
  print(n)
}
PM_data= PM_data[c("Site.ID" , "Daily.Mean.PM2.5.Concentration", "Date" )]

PM_data$Date=as.Date(PM_data$Date,"%m/%d/%Y")

AOD_Monitor$Date=as.Date(as.character(AOD_Monitor$Date))

data1=left_join(AOD_Monitor, PM_data, by=c("Site.ID","Date"))
names(data1)[4]="PM25"
data1=subset(data1, !(is.na(PM25)))


require(lmerTest)
result=lmer(PM25~1+AOD+(1+AOD|Date)+(1|Site.ID), data=data1)
summary(result)
data1$PM_hat=predict(result, data1)
pred_fit=lm(PM25~PM_hat, data1)
fit=as.data.frame(predict(pred_fit, data1, interval="predict"))
fit$real=data1$PM25
fit$pred=data1$PM_hat
fitted=summary(lm(PM25~PM_hat, data1))
title=paste("Fitted line, y=", round(fitted$coefficients[1],2), "+", 
            round(fitted$coefficients[2],2), "x, R square = ", 
            round(fitted$r.squared,2),'.', sep="")
library(gridExtra)

## Plot Figure 4
wd=getwd()
file.save.path <- file.path(wd, "Predict_PM25.jpg")
jpeg(file=file.save.path,
     width=1000,height=500)
ggplot(fit, aes(x=real, y=pred))+
  geom_point(size=0.15, alpha=0.2)+
  geom_abline(slope = 1, intercept=0,size=1,linetype="dotted", aes(colour="black"))+
  geom_smooth(aes(x=pred, y=fit,linetype='95% confidence interval',colour='mean'),size=0.8)+
  geom_smooth(aes(x=pred, y=lwr, linetype='mean',colour='95% confidence interval'),size=0.6)+
  geom_smooth(aes(x=pred, y=upr,linetype='mean',colour='95% confidence interval'),size=0.6)+
  scale_color_manual(name=title,c('mean','95% confidence interval'), values=c('blue','red'))+
  scale_linetype_manual(name=title,labels=c('mean','95% confidence interval'), values=c('solid','dashed'))+
  xlab("True PM 2.5")+ ylab("Predicted PM 2.5")+
  theme(legend.position="bottom", legend.title = element_text(size=20), legend.text = element_text(size=20),
        axis.text = element_text(size=20), axis.title = element_text(size=20))+
  annotate("text", x=95,y=88, label="45 degree line", size=8)
dev.off()
date_list=unique(data1$Date)
## Overall Effect

load("Main_Data.Rdata")
treat_list=unique(data$OBJECTID[which(data$window==1|data$prod==1)])

DiD_spatial_density=felm(AOD ~ window + prod + window_2 + window_5 + window_10
                         + prod_2 + prod_5 + prod_10 + Treatment_window_count_20km +  Treatment_prod_count_20km + Plug_Count_20KM
                         + PPT + TDMEAN + TMEAN + Wind_Speed | 
                           factor(OBJECTID)+factor(DATE) |0 |clust , data = data)
summary(DiD_spatial_density)
coef=DiD_spatial_density$coefficients
data$overall_effect=data$window*coef[1]+data$prod*coef[2]+data$window_2*coef[3]+data$window_5*coef[4]+data$window_10*coef[5]+data$prod_2*coef[6]+data$prod_5*coef[7]+data$prod_10*coef[8]
data$back=data$AOD-data$overall_effect

pred.data=data[c("OBJECTID","DATE","AOD","overall_effect", "back")]
names(pred.data)[2]=c("Date")


pred.data1=subset(pred.data,Date%in%date_list)

## Give a random Site.ID so that we can do prediction using previous regression, in which Site.ID is involved
## This will not affect our results because the part associated with Site.ID will be cancelled out when calculating effect (Predicted PM - Predicted Background PM cancelled it)
pred.data1$Site.ID=420030064

pred.data1$Pred_PM=predict(result, pred.data1)
names(pred.data1)[3]="True_AOD"
names(pred.data1)[5]="AOD"
pred.data1$Pred_back_PM=predict(result, pred.data1)
names(pred.data1)[5]="Background_AOD"
pred.data1$PM_effect_random=pred.data1$Pred_PM-pred.data1$Pred_back_PM
pred.data1=pred.data1[c("OBJECTID","Date","True_AOD","Background_AOD","overall_effect", "PM_effect_random")]
save(pred.data1, file="Welfare_Effect.Rdata")



## Prepare death and population data
load("Welfare_Effect.Rdata")

## Find blockgroup-well link
well_info=read.csv("well_info.csv")

Block_Group=st_read("PA_2010/PA_blck_grp_2010.shp")

well=well_info[c("OBJECTID","LATITUDE","LONGITUDE")]
coordinates(well)=c("LONGITUDE","LATITUDE")
proj4string(well)=CRS("+proj=longlat +ellps=WGS84 +datum=WGS84")
well=st_as_sf(well) 
well=st_transform(well, st_crs(Block_Group))

##Intersect
Block_Group$GISJOIN=paste("G",Block_Group$STATEFP10, "0",Block_Group$COUNTYFP10,
                          "0", Block_Group$TRACTCE10, Block_Group$BLKGRPCE10, sep = "")
Block_Group=Block_Group[c("GISJOIN")]

gg=st_intersects(well,Block_Group)
aa=as.matrix(gg)
gg=as.data.frame(gg)


gg$OBJECTID=well$OBJECTID

gg$Block=Block_Group$GISJOIN[gg$col.id]

gg=gg[3:4]
gg$Block=as.character(gg$Block)


data=left_join(pred.data1, gg, by="OBJECTID")

## Import population data
setwd("Population_BlockGroup")
out.file=list.files ()
data_year= read.csv(paste(out.file[1]))
county_list=data_year[c("GISJOIN","COUNTY")]
col=ncol(data_year)-2
data_year=cbind(data_year[c("GISJOIN", "YEAR")], data_year[,col], data_year$COUNTYA)
names(data_year)[3]="Pop"
names(data_year)[4]="County.Code"
pop=data_year

for(i in 2:length(out.file)){
  data_year= read.csv(paste(out.file[i]))
  col=ncol(data_year)-2
  data_year=cbind(data_year[c("GISJOIN", "YEAR")], data_year[,col], data_year$COUNTYA)
  names(data_year)[3]="Pop"
  names(data_year)[4]="County.Code"
  pop=rbind(pop, data_year)
}

pop$YEAR=substr(pop$YEAR, 6,9)
pop$YEAR=as.numeric(pop$YEAR)
  
## Merge with data
data$YEAR=year(data$Date)

data=subset(data, YEAR>=2010&YEAR<2018)

names(pop)[1]="Block"
pop$Block=as.character(pop$Block)
data=left_join(data, pop, by=c("Block","YEAR"))

# Get mortality
setwd("working directory/Mortality")
Cardio=read.delim("Cardio_2010_2017.txt")
COPD=read.delim("COPD_2010_2017.txt")
all_death=read.delim("all_death_2010_2017.txt")
## Some missing value due to small mortality and confidencial.

Cardio=Cardio[c("County.Code","Year","Deaths","Population")]
names(Cardio)[4]="County_pop"
names(Cardio)[3]="Cardio_deaths"
names(Cardio)[2]="YEAR"
COPD=COPD[c("County.Code","Year","Deaths")]
names(COPD)[3]="COPD_deaths"
names(COPD)[2]="YEAR"
all_death=all_death[c("County.Code","Year","Deaths")]
names(all_death)[3]="all_deaths"
names(all_death)[2]="YEAR"

## Get full county FIPS code
data$County.Code=data$County.Code+42000

data=left_join(data, Cardio, by=c("County.Code","YEAR"))
data=left_join(data, COPD, by=c("County.Code","YEAR"))
data=left_join(data, all_death, by=c("County.Code","YEAR"))

## death ratio
data$Cardio_ratio=data$Cardio_deaths/data$County_pop
data$COPD_ratio=data$COPD_deaths/data$County_pop
data$all_ratio=data$all_deaths/data$County_pop



data_Block=data[c("Block","YEAR","PM_effect_random","Cardio_ratio","COPD_ratio","all_ratio","Pop")]

data_impact=aggregate(data_Block[c("PM_effect_random","Cardio_ratio","COPD_ratio","all_ratio","Pop")], by=list(data_Block$Block,data_Block$YEAR),FUN=mean)
names(data_impact)[1:2]=c("BlockGroup","Year")

data_impact$Cardio_deaths=data_impact$Cardio_ratio*data_impact$Pop
data_impact$COPD_deaths=data_impact$COPD_ratio*data_impact$Pop
data_impact$all_deaths=data_impact$all_ratio*data_impact$Pop

## Extra death of fracking over PA, at least
data_impact$Cardio_impact=data_impact$Cardio_deaths-data_impact$Cardio_deaths/(exp(log(1.26)/10*data_impact$PM_effect_random))
data_impact$COPD_impact=data_impact$COPD_deaths-data_impact$COPD_deaths/(exp(log(1.17)/10*data_impact$PM_effect_random))
data_impact$all_impact=data_impact$all_deaths-data_impact$all_deaths/(exp(log(1.14)/10*data_impact$PM_effect_random))

data_impact$Cardio_impact_low=data_impact$Cardio_deaths-data_impact$Cardio_deaths/(exp(log(1.14)/10*data_impact$PM_effect_random))
data_impact$COPD_impact_low=data_impact$COPD_deaths-data_impact$COPD_deaths/(exp(log(0.85)/10*data_impact$PM_effect_random))
data_impact$all_impact_low=data_impact$all_deaths-data_impact$all_deaths/(exp(log(1.07)/10*data_impact$PM_effect_random))

data_impact$Cardio_impact_high=data_impact$Cardio_deaths-data_impact$Cardio_deaths/(exp(log(1.40)/10*data_impact$PM_effect_random))
data_impact$COPD_impact_high=data_impact$COPD_deaths-data_impact$COPD_deaths/(exp(log(1.62)/10*data_impact$PM_effect_random))
data_impact$all_impact_high=data_impact$all_deaths-data_impact$all_deaths/(exp(log(1.22)/10*data_impact$PM_effect_random))

## Merge county names
names(county_list)[1]="BlockGroup"
county_list$County_Code=substr(county_list$BlockGroup,1,7)
county_list=unique(county_list[c("County_Code","COUNTY")])
data_impact$County_Code=substr(data_impact$BlockGroup,1,7)
data_impact=left_join(data_impact, county_list, by="County_Code", all.x=T)

impact_county=aggregate(data_impact[c("PM_effect_random","Cardio_ratio","COPD_ratio","all_ratio")], by=list(data_impact$Year,data_impact$COUNTY), FUN=mean)
names(impact_county)[1:2]=c("Year","County")
impact_county2=aggregate(data_impact[c("Pop","Cardio_deaths","COPD_deaths","all_deaths", "all_impact")], by=list(data_impact$Year,data_impact$COUNTY), FUN=sum)
names(impact_county2)[1:2]=c("Year","County")
impact_county=left_join(impact_county,impact_county2,by=c("Year","County"))

## Impact by County
## Number of wells by county
well_info$count=1
well_info$treated_count=0
well_info$treated_count[which(well_info$OBJECTID%in%treat_list)]=1
well_per_county=aggregate(well_info[c('count','treated_count')],by=list(well_info$COUNTY), FUN=sum)
names(well_per_county)=c("County","Well_Number","Treated_Well_Number")
well_per_county$County=paste(well_per_county$County, "County",sep = ' ')
impact_county$County=as.character(impact_county$County)
impact_county1=left_join(impact_county,well_per_county,by='County')

## Table 7 
t=2010 #Change the year here to get values for different years
sum(data_impact$Cardio_impact_low[which(data_impact$Year==t)])
sum(data_impact$Cardio_impact_high[which(data_impact$Year==t)])
sum(data_impact$COPD_impact_low[which(data_impact$Year==t)], na.rm=T)
sum(data_impact$COPD_impact_high[which(data_impact$Year==t)], na.rm=T)
sum(data_impact$all_impact_low[which(data_impact$Year==t)])
sum(data_impact$all_impact_high[which(data_impact$Year==t)])

sum(data_impact$Cardio_impact_low)
sum(data_impact$Cardio_impact_high)
sum(data_impact$COPD_impact_low, na.rm=T)
sum(data_impact$COPD_impact_high, na.rm=T)
sum(data_impact$all_impact_low)
sum(data_impact$all_impact_high)
mean(data_impact$Pop)


